notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
from sklearn import * 
import matplotlib.pyplot as plt

%matplotlib inline



In [2]:

    
df = pd.read_csv("/data/credit-default.csv")
df.head()









    Out[2]:







  
    
      
      checking_balance
      months_loan_duration
      credit_history
      purpose
      amount
      savings_balance
      employment_length
      installment_rate
      personal_status
      other_debtors
      ...
      property
      age
      installment_plan
      housing
      existing_credits
      default
      dependents
      telephone
      foreign_worker
      job
    
  
  
    
      0
      < 0 DM
      6
      critical
      radio/tv
      1169
      unknown
      > 7 yrs
      4
      single male
      none
      ...
      real estate
      67
      none
      own
      2
      1
      1
      yes
      yes
      skilled employee
    
    
      1
      1 - 200 DM
      48
      repaid
      radio/tv
      5951
      < 100 DM
      1 - 4 yrs
      2
      female
      none
      ...
      real estate
      22
      none
      own
      1
      2
      1
      none
      yes
      skilled employee
    
    
      2
      unknown
      12
      critical
      education
      2096
      < 100 DM
      4 - 7 yrs
      2
      single male
      none
      ...
      real estate
      49
      none
      own
      1
      1
      2
      none
      yes
      unskilled resident
    
    
      3
      < 0 DM
      42
      repaid
      furniture
      7882
      < 100 DM
      4 - 7 yrs
      2
      single male
      guarantor
      ...
      building society savings
      45
      none
      for free
      1
      1
      2
      none
      yes
      skilled employee
    
    
      4
      < 0 DM
      24
      delayed
      car (new)
      4870
      < 100 DM
      1 - 4 yrs
      3
      single male
      none
      ...
      unknown/none
      53
      none
      for free
      2
      2
      2
      none
      yes
      skilled employee
    
  

5 rows × 21 columns



In [3]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_length       1000 non-null object
installment_rate        1000 non-null int64
personal_status         1000 non-null object
other_debtors           1000 non-null object
residence_history       1000 non-null int64
property                1000 non-null object
age                     1000 non-null int64
installment_plan        1000 non-null object
housing                 1000 non-null object
existing_credits        1000 non-null int64
default                 1000 non-null int64
dependents              1000 non-null int64
telephone               1000 non-null object
foreign_worker          1000 non-null object
job                     1000 non-null object
dtypes: int64(8), object(13)
memory usage: 164.1+ KB



In [4]:

    
df.default.value_counts()









    Out[4]:





1    700
2    300
Name: default, dtype: int64



In [5]:

    
target = "default"
label_encoder = preprocessing.LabelEncoder()

y = label_encoder.fit_transform(df[target])
X = df.drop(columns=[target])
X.head()









    Out[5]:







  
    
      
      checking_balance
      months_loan_duration
      credit_history
      purpose
      amount
      savings_balance
      employment_length
      installment_rate
      personal_status
      other_debtors
      residence_history
      property
      age
      installment_plan
      housing
      existing_credits
      dependents
      telephone
      foreign_worker
      job
    
  
  
    
      0
      < 0 DM
      6
      critical
      radio/tv
      1169
      unknown
      > 7 yrs
      4
      single male
      none
      4
      real estate
      67
      none
      own
      2
      1
      yes
      yes
      skilled employee
    
    
      1
      1 - 200 DM
      48
      repaid
      radio/tv
      5951
      < 100 DM
      1 - 4 yrs
      2
      female
      none
      2
      real estate
      22
      none
      own
      1
      1
      none
      yes
      skilled employee
    
    
      2
      unknown
      12
      critical
      education
      2096
      < 100 DM
      4 - 7 yrs
      2
      single male
      none
      3
      real estate
      49
      none
      own
      1
      2
      none
      yes
      unskilled resident
    
    
      3
      < 0 DM
      42
      repaid
      furniture
      7882
      < 100 DM
      4 - 7 yrs
      2
      single male
      guarantor
      4
      building society savings
      45
      none
      for free
      1
      2
      none
      yes
      skilled employee
    
    
      4
      < 0 DM
      24
      delayed
      car (new)
      4870
      < 100 DM
      1 - 4 yrs
      3
      single male
      none
      4
      unknown/none
      53
      none
      for free
      2
      2
      none
      yes
      skilled employee



In [6]:

    
cat_columns = [field for field in dict(X.dtypes) 
               if X.dtypes[field] == "object"]
cat_columns









    Out[6]:





['checking_balance',
 'credit_history',
 'purpose',
 'savings_balance',
 'employment_length',
 'personal_status',
 'other_debtors',
 'property',
 'installment_plan',
 'housing',
 'telephone',
 'foreign_worker',
 'job']



In [7]:

    
num_columns = [field for field in dict(X.dtypes) 
               if X.dtypes[field] != "object"]
num_columns









    Out[7]:





['months_loan_duration',
 'amount',
 'installment_rate',
 'residence_history',
 'age',
 'existing_credits',
 'dependents']



In [8]:

    
cat_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='constant'
                                     , fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='error'
                                           , drop="first"))
]) 

num_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('poly', preprocessing.PolynomialFeatures(degree=1
                                    , include_bias=False)),
    ('scaler', preprocessing.StandardScaler()),
])

preprocessing_pipe = compose.ColumnTransformer([
    ("cat", cat_pipe, cat_columns),
    ("num", num_pipe, num_columns)
])

Simple logistic regression



In [9]:

    
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", linear_model.LogisticRegression(random_state=1
                                , solver="liblinear"))
])


param_grid = {
    "est__C": np.random.random(10) + 1
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                        , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_
        , "Best parameters: ", gsearch.best_params_)









    



Fitting 5 folds for each of 10 candidates, totalling 50 fits






    



[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.






    



Best score:  0.755 Best parameters:  {'est__C': 1.557903180725127}






    



[Parallel(n_jobs=8)]: Done  35 out of  50 | elapsed:    1.5s remaining:    0.7s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.6s finished

Ensemble Classifier



In [10]:

    
log_clf = linear_model.LogisticRegression(C = 1.53
                            , solver= "liblinear", random_state=1) 
rnd_clf = ensemble.RandomForestClassifier(max_depth=6
                            , n_estimators = 30, random_state=1) 
svm_clf = svm.SVC(C = 1.0, gamma = 0.15, random_state=1) 


estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.VotingClassifier(voting="hard", estimators=
                                      [('lr', log_clf), 
                                       ('rf', rnd_clf), 
                                       ('svm', svm_clf)
                                      ])
    )
])


param_grid = {
    "est__svm__C": np.linspace(1.0, 20, 10)
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                    , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)









    



[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.






    



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best score:  0.765 Best parameters:  {'est__svm__C': 5.222222222222222}






    



[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.1s finished

AdaBoost Classifier



In [11]:

    
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.AdaBoostClassifier(
          linear_model.LogisticRegression(random_state=1
                                          , solver="liblinear")
        , n_estimators=200
        , algorithm="SAMME.R"
        , learning_rate=0.051)

    )
])


param_grid = {
    "est__base_estimator__C": np.random.random(10) + 1
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                                , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)









    



[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.






    



Fitting 5 folds for each of 10 candidates, totalling 50 fits






    



[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    3.7s finished






    



Best score:  0.734 Best parameters:  {'est__base_estimator__C': 1.0258494070869997}

Bagging classifier



In [12]:

    
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.BaggingClassifier(
                tree.DecisionTreeClassifier(), 
                max_samples= 0.5,
                n_estimators=50,
                bootstrap=True, 
                oob_score=True)
    )
])


param_grid = {
    "est__base_estimator__max_depth": np.arange(5, 15)
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                        , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)









    



Fitting 5 folds for each of 10 candidates, totalling 50 fits






    



[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.






    



Best score:  0.757 Best parameters:  {'est__base_estimator__max_depth': 12}






    



[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.1s finished

Gradient Boosted Model



In [18]:

    
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.GradientBoostingClassifier(random_state=1))
])


param_grid = {
    "est__max_depth": np.arange(3, 10),
    "est__learning_rate": np.linspace(0.01, 1, 10)
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                        , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_
        , "Best parameters: ", gsearch.best_params_)









    



Fitting 5 folds for each of 70 candidates, totalling 350 fits






    



[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    4.5s
[Parallel(n_jobs=8)]: Done 205 tasks      | elapsed:   14.8s






    



Best score:  0.76 Best parameters:  {'est__learning_rate': 0.12, 'est__max_depth': 3}






    



[Parallel(n_jobs=8)]: Done 350 out of 350 | elapsed:   20.8s finished



In [23]:

    
scores = pd.DataFrame(gsearch.cv_results_)
scores.head()









    Out[23]:







  
    
      
      mean_fit_time
      std_fit_time
      mean_score_time
      std_score_time
      param_est__learning_rate
      param_est__max_depth
      params
      split0_test_score
      split1_test_score
      split2_test_score
      split3_test_score
      split4_test_score
      mean_test_score
      std_test_score
      rank_test_score
    
  
  
    
      0
      0.178265
      0.008670
      0.006912
      0.001198
      0.01
      3
      {'est__learning_rate': 0.01, 'est__max_depth': 3}
      0.705
      0.705
      0.710
      0.685
      0.705
      0.702
      0.008718
      70
    
    
      1
      0.315358
      0.017413
      0.006609
      0.001425
      0.01
      4
      {'est__learning_rate': 0.01, 'est__max_depth': 4}
      0.730
      0.740
      0.720
      0.700
      0.710
      0.720
      0.014142
      67
    
    
      2
      0.482575
      0.019085
      0.008350
      0.000537
      0.01
      5
      {'est__learning_rate': 0.01, 'est__max_depth': 5}
      0.725
      0.750
      0.715
      0.730
      0.730
      0.730
      0.011402
      60
    
    
      3
      0.660290
      0.010609
      0.006561
      0.001300
      0.01
      6
      {'est__learning_rate': 0.01, 'est__max_depth': 6}
      0.730
      0.775
      0.725
      0.730
      0.720
      0.736
      0.019849
      47
    
    
      4
      0.838601
      0.033753
      0.006765
      0.000963
      0.01
      7
      {'est__learning_rate': 0.01, 'est__max_depth': 7}
      0.755
      0.765
      0.700
      0.725
      0.725
      0.734
      0.023324
      52



In [24]:

    
scores[scores.rank_test_score == 1]









    Out[24]:







  
    
      
      mean_fit_time
      std_fit_time
      mean_score_time
      std_score_time
      param_est__learning_rate
      param_est__max_depth
      params
      split0_test_score
      split1_test_score
      split2_test_score
      split3_test_score
      split4_test_score
      mean_test_score
      std_test_score
      rank_test_score
    
  
  
    
      7
      0.190787
      0.010879
      0.007576
      0.000844
      0.12
      3
      {'est__learning_rate': 0.12, 'est__max_depth': 3}
      0.755
      0.765
      0.785
      0.745
      0.75
      0.76
      0.014142
      1



In [ ]:

	checking_balance	months_loan_duration	credit_history	purpose	amount	savings_balance	employment_length	installment_rate	personal_status	other_debtors	...	property	age	installment_plan	housing	existing_credits	default	dependents	telephone	foreign_worker	job
0	< 0 DM	6	critical	radio/tv	1169	unknown	> 7 yrs	4	single male	none	...	real estate	67	none	own	2	1	1	yes	yes	skilled employee
1	1 - 200 DM	48	repaid	radio/tv	5951	< 100 DM	1 - 4 yrs	2	female	none	...	real estate	22	none	own	1	2	1	none	yes	skilled employee
2	unknown	12	critical	education	2096	< 100 DM	4 - 7 yrs	2	single male	none	...	real estate	49	none	own	1	1	2	none	yes	unskilled resident
3	< 0 DM	42	repaid	furniture	7882	< 100 DM	4 - 7 yrs	2	single male	guarantor	...	building society savings	45	none	for free	1	1	2	none	yes	skilled employee
4	< 0 DM	24	delayed	car (new)	4870	< 100 DM	1 - 4 yrs	3	single male	none	...	unknown/none	53	none	for free	2	2	2	none	yes	skilled employee

	mean_fit_time	std_fit_time	mean_score_time	std_score_time	param_est__learning_rate	param_est__max_depth	params	split0_test_score	split1_test_score	split2_test_score	split3_test_score	split4_test_score	mean_test_score	std_test_score	rank_test_score
0	0.178265	0.008670	0.006912	0.001198	0.01	3	{'est__learning_rate': 0.01, 'est__max_depth': 3}	0.705	0.705	0.710	0.685	0.705	0.702	0.008718	70
1	0.315358	0.017413	0.006609	0.001425	0.01	4	{'est__learning_rate': 0.01, 'est__max_depth': 4}	0.730	0.740	0.720	0.700	0.710	0.720	0.014142	67
2	0.482575	0.019085	0.008350	0.000537	0.01	5	{'est__learning_rate': 0.01, 'est__max_depth': 5}	0.725	0.750	0.715	0.730	0.730	0.730	0.011402	60
3	0.660290	0.010609	0.006561	0.001300	0.01	6	{'est__learning_rate': 0.01, 'est__max_depth': 6}	0.730	0.775	0.725	0.730	0.720	0.736	0.019849	47
4	0.838601	0.033753	0.006765	0.000963	0.01	7	{'est__learning_rate': 0.01, 'est__max_depth': 7}	0.755	0.765	0.700	0.725	0.725	0.734	0.023324	52